cd "<INSERT DIRECTORY NAME>"

*Prepare health insurance for later years data
import excel using "Medicaid inflation for recent years_2019.xlsx", sheet("table_form") cellrange(b1:n205) firstrow clear
reshape long my_mvcaid_, i(state elderly child disabled mcaid) j(year)
replace year = year + 1
rename state statefip
save "Medicaid_2012_2018_reshaped.dta", replace

import excel using "Medicare inflation for recent years_2019.xlsx", sheet("table_form") cellrange(b1:l103) firstrow clear
reshape long my_mvcare_, i(state elderly mcare) j(year)
replace year = year + 1
rename state statefip
save "Medicare_2012_2018_reshaped.dta", replace

*Bring in IPUMS data
use cps_00086, clear
gen recordNum = _n
recode incwage (. 99999999 99999998 = 0)
recode incbus (. 99999999 99999998 = 0)
recode incfarm (. 99999999 99999998 = 0)
recode incunern (. 99999 99998 = 0)
recode incss (. 999999 = 0)
recode incwelfr (. 999999 99999 = 0)
recode incgov (. 99999 = 0)
recode incidr (. 99999 = 0)
recode incaloth (. 99999 = 0)
recode incretir (. 99999999 = 0)
recode incssi (. 999999 = 0)
recode incdrt (. 99999 = 0)
recode incint (. 9999999 = 0)
recode incunemp (. 999999 = 0)
recode incwkcom (. 999999 = 0)
recode incvet (. 9999999 = 0)
recode incsurv (. 9999999 = 0)
recode incdisab (. 9999999 = 0)
recode incdivid (. 9999999 = 0)
recode incrent (. 9999999 = 0)
recode inceduc (. 999999 = 0)
recode incchild (. 999999 = 0)
recode incalim (. 999999 = 0)
recode incasist (. 9999999 = 0)
recode incother (. 9999999 = 0)
recode inctot (. 999999999 999999998 = 0)
recode incrann (. = 0)
recode incpens (. = 0)
recode foodstamp (. = 0)
recode schllunch (. 99999 = 0)
recode houssub (. = 0)
replace houssub = houssub * 12
recode spmcaphous (. = 0)
replace houssub = spmcaphous if year>=2016

*Health insurance
recode emcontrb (. 9999 = 0)
replace pmvcare = uh_mvcare_a1 if pmvcare==. & uh_mvcare_a1>=0
replace pmvcaid = uh_mvcaid_a1 if pmvcaid==. & uh_mvcaid_a1>=0
replace pmvcaid = uh_mvcaid_a2 if pmvcaid==. & uh_mvcaid_a2>=0
replace pmvcaid = uh_mvcaid_a1 if year==2001 | year==2002/*Fix pmvcaid coding problem where equal to Medicare values in these two years*/

*Merge in HI values for 2012-2020
gen child = age<18
gen elderly = age>=65
gen disabled = age<=61 & incss>0 | incssi>0
replace disabled = 1 if age<=64 & himcarely==2
gen mcaid = himcaidly==2
gen mcare = himcare==2

joinby year state elderly mcare using Medicare_2012_2018_reshaped.dta, unmatched(master)
drop _merge
joinby year state elderly child disabled mcaid using Medicaid_2012_2018_reshaped.dta, unmatched(master) update
drop _merge

replace pmvcaid = my_mvcaid_ if year>=2015
replace pmvcare = my_mvcare_ if year>=2015

replace pmvcaid = 0 if pmvcaid==.
replace pmvcare = 0 if pmvcare==.

*Impute 2019 and 2020 employer contributions to health insurance
recode paidgh (21 = 1 "part") (22 = 2 "all") (else = 0 "no"), gen(hi_type)
replace grptyply = 1 if grptyply==2 & year>=2019/*Recode self plus one plan to family plan in 2019...note mistaken coding in IPUMS data*/
replace grptyply = 2 if grptyply==3 & year>=2019/*Recode self plan to self plan in 2019...note mistaken coding in IPUMS data*/
gen log_weeks_work = log(wkswork1)
gen log_emcontrb = log(emcontrb)
reg log_emcontrb i.hi_type i.grptyply i.fullpart log_weeks_work if emcontrb>=0 & emcontrb<9999 & grpownly==2 & year==2018
predict log_emcontrb_pred if grpownly==2 & year>=2019
replace emcontrb = exp(log_emcontrb_pred) * 480.709 / 471.609 if year==2019/*Medical inflation available at https://fred.stlouisfed.org/series/CPIMEDSL*/
replace emcontrb = exp(log_emcontrb_pred) * 489.665 / 471.609 if year==2020
replace emcontrb = 0 if emcontrb==.

save cps_all_0, replace

/******************************************************************************************
*******************************************************************************************
**Add 1980-1991 in-kind benefits**
*******************************************************************************************
*******************************************************************************************/

import sas "noncash_benefits.sas7bdat", clear

bysort year: gen IPUMS_record = _n
save cps_in_kind, replace

use cps_all_0, clear
bysort year: gen IPUMS_record = _n
merge 1:1 year IPUMS_record using cps_in_kind, nogen

replace pmvcaid = mvcaid if year>=1980 & year<=1991
replace pmvcare = mvcare if year>=1980 & year<=1991
replace emcontrb = emcont if year>=1980 & year<=1991
replace houssub = fhoussub * 12 if year>=1980 & year<=1991
replace schllunch = fmvsl if year>=1980 & year<=1991
replace foodstamp = fmvfs if year>=1980 & year<=1991

*Ensure all family members have same value of food stamps, school lunch, and housing assistance
foreach x in foodstamp schllunch houssub {
	bysort year serial famunit: egen max_`x' = max(`x')
	replace `x' = max_`x'
}

save cps_all, replace


/******************************************
*******************************************
**Supplement pre-1980 data for imputation**
*******************************************
*******************************************/

use cps_all, clear
keep if year<=1980/*will use 1980 for models/values, need to impute in all years prior*/

/*Number of children, adults, and people in each family*/
bysort year serial famunit: egen num_kids = sum(child)
replace num_kids = uh_child18_a1 if year<=1967/*Kids not separate observations in pre-1968, so have to use variable indicating number under 18*/
gen adult=age>=18
bysort year serial famunit: egen num_adults = sum(adult)/*Number of adults in family*/
gen num_fam = num_kids + num_adults
recode num_kids (6/99=5)
recode num_adults (4/99=3)

/*Generate family weight*/
egen tag_fam = tag(year serial famunit)
bysort year serial: egen sep_fams = sum(tag_fam)
replace num_fam = uh_numper_a1 if year<=1967 & sep_fams==1 & num_fam==7 & uh_numper_a1>7
bysort year serial famunit: egen num_obs = count(famunit)
gen asecwt2 = asecwt
replace asecwt2 = asecwt * num_fam / num_obs if year<=1967/*Adjust for CPS-ASEC excluding younger children in 1967 and earlier*/
bysort year serial famunit: egen fam_weight = sum(asecwt2)

/*Demographics*/
recode fullpart (0 9 = 0) (1 2 = 1), gen(worker)/*Worker if worked last year*/
bysort year serial famunit: egen num_workers = sum(worker)/*Number of workers in family*/
gen married = marst==1
recode educ (1 999 = 0 "unknown/missing/NIU") (2/60 = 1 "less than high school") (72 73 = 2 "high school") (80/100 = 3 "some college") (110/122 = 4 "college"), gen(educ_recode)
recode age (0/18 = 0) (19/22 = 1) (23/25 = 2) (26/30 = 3) (31/35 = 4) (36/40 = 5) (41/50 = 6) (51/60 = 7) (61/65 = 8) (66/99=9), gen(age_recode)

/*Family income*/
recode ftotval (9999999999 .=0)
gen fam_income = ftotval
replace fam_income = 0 if fam_income<0
replace fam_income = 50000 if fam_income > 50000/*Top code income at $50,000 to keep functional form more flexible in this relevant range*/
gen fam_income_2 = fam_income^2
gen fam_income_3 = fam_income^3

/*Welfare/disability income*/
recode incwelfr (. 99999 999999=0)/*Note IPUMS coding issue where using 99999 as NA for 1975 and earlier, but 999999 for 1976 and later*/
gen rec_welfare = incwelf>0 | incssi>0/*Since SSI not separated out from cash welfare before 1976 (survey year)*/


save cps_pre_1980, replace


/**********************************************************************************
***********************************************************************************
***Get list of states and population weights for FIPS codes with multiple states***
***********************************************************************************
**********************************************************************************/

/*Get list of statefip codes with multiple states*/
use cps_pre_1980, clear
keep statefip
sort statefip
drop if statefip==statefip[_n-1]
decode statefip, gen(statefip_string)
save state_list, replace

use state_list, clear
replace statefip_string = regexr(statefip_string,"n dakota", "north dakota")
replace statefip_string = regexr(statefip_string,"s dakota", "south dakota")

gen pos_0=1
gen frag_0="XXXX"
gen diff=0
local i = 0
local ender = 1
while `ender' > 0 {
	local i = `i'+1
	local j = `i'-1
	gen pos_`i' = pos_`j' + strpos(substr(statefip_string,pos_`j',.),"-")-1
	gen frag_`i' = substr(statefip_string,pos_`j',pos_`i'-pos_`j')
	replace pos_`i'=pos_`i'+1
	replace frag_`i' = substr(statefip_string,strrpos(statefip_string,"-")+1,.) if pos_`i'==pos_`j'
	replace frag_`i'="" if frag_`i'==frag_`j' | frag_`j'==""
	replace diff = pos_`i'-pos_`j'
	sum diff
	local ender = `r(sum)'
}
drop pos_* frag_0 diff statefip_string
reshape long frag_, i(statefip) j(tempy)
drop if frag_==""
drop tempy
rename frag_ statename 
save state_key, replace
keep if statefip<=56
rename statefip statefip2
save state_key_short, replace
merge 1:m statename using state_key, nogen
/*State code 99 is unidentified state so merge in every state*/
append using state_key_short, gen(appended)
drop if statefip==99
replace statefip=99 if appended==1
drop appended
save state_key, replace

/*Merge in state populations, from https://www2.census.gov/programs-surveys/popest/tables/1980-1990/state/asrh/st6070ts.txt*/
insheet using "1970_pop.csv", clear
destring pop_1970, replace ignore(",")
replace pop_1970 = 1000 * pop_1970
merge 1:m statefip2 using state_key, nogen
bysort statefip: egen sum_pop_1970 = sum(pop_1970)
gen pop_share = pop_1970 / sum_pop_1970
order statefip statefip2 statename pop_share
sort statefip statefip2 
drop pop_1970 sum_pop_1970
gen always_1=1
save state_key_pop, replace


/***************************************
****************************************
**************Medicare******************
****************************************
****************************************/
set seed 32673675

/*Bring in Medicare spending and enrollees pre-1980*/
import excel "caseloads_pre_1980.xlsx", firstrow sheet("Medicare") clear
drop if year=="Source" | year==""
destring year NHEAMedicareExpenditures, replace
keep year frac1979 NHEAMedicareExpenditures
replace year=year+1/*Put into survey year*/
save medicarePre1980, replace

/*Get 1979 Medicare value by disabled & elderly by state*/
use cps_pre_1980, clear
keep if year==1980 & pmvcare>0

/*Get fraction of disabled and aged Medicare recipients who appear in 1979 CPS (based on disabled and aged definitions applied to 1967-1978)*/
gen disabled_keeper = 0
replace disabled_keeper = 1 if age<=62 & incss>0 & incss<99999
replace disabled_keeper = 0 if age>=61 & age<=62 & sex==2 & marst==5
sum asecwt if disabled_keeper==1
scalar medicare_pop_1979_frac_disabled = r(sum)/2910766/*disabled medicare enrollees in 1979, available here: https://www.census.gov/history/pdf/medicare1966-2013.pdf*/

sum asecwt if age>=66
scalar medicare_pop_1979_frac_aged = r(sum)/24548391/*aged medicare enrollees in 1979, available here: https://www.census.gov/history/pdf/medicare1966-2013.pdf*/

/*Get Medicare market values in 1979 by risk class and state*/
gen risk_class = ""
replace risk_class = "aged" if age>=66/*Assuming subtracted year from age to get market values*/
replace risk_class = "disabled" if age<63/*Assuming subtracted year from age to get market values*/
drop if risk_class==""
collapse (mean) pmvcare, by(risk_class statefip)
rename pmvcare pmvcare_1979
rename statefip statefip2
joinby statefip2 using state_key_pop
replace pmvcare_1979 = pmvcare_1979 * pop_share
collapse (sum) pmvcare_1979, by(statefip risk_class)
save medicare_1979, replace


/*Predict Medicare receipt among non-aged people receiving Social Security in 1979*/
use cps_pre_1980, clear
keep if year==1980
gen keeper = 0
replace keeper = 1 if age<=62 & incss>0
replace keeper = 0 if age>=61 & age<=62 & sex==2 & marst==5
gen medicare_receipt = pmvcare>0

probit medicare_receipt i.age i.sex i.race i.educ_recode i.married if keeper==1 [pweight=asecwt]
est store reg_medicare

/*Bring in pre-1980 CPS data, and Medicare spending per capita*/
use cps_pre_1980, clear
drop pmvcare
keep if year<=1979
gen risk_class=""
replace risk_class="aged" if age>=66 
replace risk_class="disabled" if age<=62 & incss>0 & year>=1974
/*According to this website, disabled don't get benefits until 1973: https://www.ssa.gov/history/1972amend.html*/
replace risk_class="" if age>=61 & age<=62 & sex==2 & marst==5
est restore reg_medicare
predict prob_medicare
replace prob_medicare = 0 if risk_class!="disabled"
gen num_medicare = asecwt * prob_medicare
gen medicare_disabled = .
capture program drop impute_medicare
program define impute_medicare
	sum num_medicare if year==`1'+1
	scalar factor_prob = `2' * medicare_pop_1979_frac_disabled/r(sum)
	replace prob_medicare = min(1,prob_medicare * factor_prob) if year==`1'+1
	replace medicare_disabled = rbinomial(1,prob_medicare) if year==`1'+1
	replace risk_class = "" if risk_class=="disabled" & medicare_disabled==0 & year==`1'+1
	sum asecwt if risk_class=="disabled" & year==`1'+1
	scalar diff_sums = r(sum) - `2' * medicare_pop_1979_frac_disabled
	
	while diff_sums<-5000 {
		local j = max(1,round(abs(diff_sums/5000)))
		gsample `j' [w=prob_medicare] if risk_class!="disabled" & year==`1'+1, wor gen(add_medicare)
		replace risk_class = "disabled" if add_medicare==1
		drop add_medicare
		sum asecwt if risk_class=="disabled" & year==`1'+1
		scalar diff_sums = r(sum) - `2' * medicare_pop_1979_frac_disabled
	}
	while diff_sums>5000 {
		local j = max(1,round(abs(diff_sums/5000)))
		gsample `j' if risk_class=="disabled" & year==`1'+1, wor gen(add_medicare)
		replace risk_class = "" if add_medicare==1
		drop add_medicare
		sum asecwt if risk_class=="disabled" & year==`1'+1
		scalar diff_sums = r(sum) - `2' * medicare_pop_1979_frac_disabled
	}
end

impute_medicare 1973 1730538 
impute_medicare 1974 1928116 
impute_medicare 1975 2168393 
impute_medicare 1976 2392158 
impute_medicare 1977 2619373 
impute_medicare 1978 2793206 
/*https://www.census.gov/history/pdf/medicare1966-2013.pdf*/

merge m:1 risk_class statefip using medicare_1979, nogen

merge m:1 year using medicarePre1980, nogen
gen pmvcare = pmvcare_1979 / frac1979
replace pmvcare = 0 if pmvcare==. | year<=1966
keep if year<=1979
drop risk_class
save cps_pre_1980_medicare, replace


/***************************************
****************************************
**************Medicaid******************
****************************************
****************************************/
set seed 12673675

/*Bring in Medicaid introduction dates by state, with fraction of population covered by Medicaid in each year for FIPS codes with multiple states attached*/
import excel "caseloads_pre_1980.xlsx", firstrow sheet("Medicaid_intro") clear
keep statename medicaid_intro
merge 1:m statename using state_key_pop, nogen
collapse (sum) pop_share, by(statefip medicaid_intro)
sort statefip medicaid_intro
bysort statefip: gen cum_pop_share = sum(pop_share)
drop pop_share
rename cum_pop_share pop_share
bysort statefip: egen seq_num = seq()
reshape wide medicaid_intro pop_share, i(statefip) j(seq_num)
save medicaid_intro, replace

/*Bring in Medicaid spending and enrollees pre-1980*/
import excel "caseloads_pre_1980.xlsx", firstrow sheet("Medicaid") clear
drop if year=="" | year=="Source"
destring year, replace
keep year frac_1979
rename frac_1979 frac_mcaid_1979
replace year=year+1/*Put into survey year*/
save medicaidPre1980, replace

/*Get 1979 Medicaid value by risk class by state*/
use cps_pre_1980, clear
keep if year==1980 & pmvcaid>0

sum asecwt
scalar medicaid_pop_1979_frac = r(sum)/19555038 

/*Can't do disabled separately because can't identify disabled pre-1980 survey year*/
gen risk_class = ""
replace risk_class = "aged" if age>=66/*Assuming subtracted year from age to get market values*/
replace risk_class = "child" if age<=18/*Assuming subtracted year from age to get market values*/
replace risk_class = "adult" if age>=19 & age<=65/*Assuming subtracted year from age to get market values*/
drop if risk_class==""
collapse (mean) pmvcaid [aweight=asecwt], by(risk_class statefip)
rename pmvcaid pmvcaid_1979
rename statefip statefip2
joinby statefip2 using state_key_pop
replace pmvcaid_1979 = pmvcaid_1979 * pop_share
collapse (sum) pmvcaid_1979, by(statefip risk_class)
save medicaid_1979, replace

/*Bring in pre-1980 CPS data (income year), and Medicaid spending per capita*/
use cps_pre_1980, clear
drop pmvcaid

gen risk_class=""
replace risk_class = "aged" if age>=66
replace risk_class = "child" if age<=18
replace risk_class = "adult" if age>=19 & age<=65

bysort year serial: egen fam_welf = max(incwelfr)
replace fam_welf = 0 if age>=19 & incwelfr==0
replace risk_class = "" if fam_welf==0 & incssi==0 

/*put households in random order*/
gen rand_ind = runiform()/*random number for each person*/
egen tagged = tag(year serial)/*tag one person per household*/
gen rand_hh_temp = -1
replace rand_hh_temp = rand_ind if tagged==1
bysort year serial: egen rand_hh = max(rand_hh_temp)/*Use one individual's random number for whole household*/
sort year statefip rand_hh/*In each year and state code, households in random order*/

gen medicaid_pop = asecwt
replace medicaid_pop = 0 if risk_class==""
bysort year statefip: egen sum_medicaid_pop = total(medicaid_pop)
gen frac_medicaid_pop = medicaid_pop / sum_medicaid_pop
bysort year statefip: gen cum_medicaid_pop = sum(frac_medicaid_pop)
bysort year statefip serial: egen max_cum_medicaid_pop = max(cum_medicaid_pop)/*Don't want to cut household in half, so use the cumulative for the whole household for each person in it*/

/*Merge in Medicaid introduction dates*/
merge m:1 statefip using medicaid_intro
drop if _merge==2
drop _merge

gen medicaid_receipt=0
forvalues i = 1/7 {
	replace medicaid_receipt = 1 if risk_class!="" & year-1>=medicaid_intro`i' & max_cum_medicaid_pop<=pop_share`i'
}
replace risk_class = "" if medicaid_receipt==0

merge m:1 risk_class statefip using medicaid_1979, nogen

merge m:1 year using medicaidPre1980, nogen
gen pmvcaid = pmvcaid_1979 * frac_mcaid_1979
replace pmvcaid = 0 if pmvcaid==.
keep if year<=1979
save cps_pre_1980_medicaid, replace


/***************************************
****************************************
**********Employer-Provided*************
****************************************
****************************************/
set seed 82673675

/*Predict employer contributions in 1979*/
use cps_pre_1980, clear
keep if year==1980
rename statefip statefip2
/*Employer contributions*/
bysort year serial famunit: egen emcontrb_fam = sum(emcontrb)
gen emp_receipt = emcontrb_fam>0

/*Model for family receipt of employer provided health insurance*/
probit emp_receipt i.age_recode i.sex i.race i.educ_recode i.married i.num_workers i.fullpart if num_workers>0 & relate==101 [pweight=fam_weight]
est store reg_emp
/*Model for total family value of employer provided health insurance, based on number of adults/kids and state*/
reg emcontrb_fam i.num_adults i.num_kids i.statefip2 if emcontrb_fam>0 & relate==101 [pweight=fam_weight]
est store reg_emp_value

/*Get ratio of CPS employer contribution dollars to NHEA private dollars*/
sum emcontrb if emcontrb_fam>0 [aweight=asecwt]
scalar avg_emp_spending_1979 = r(mean)
scalar emp_dollars_1979_frac = r(sum)/60955000000

/*Create new dataset with all combinations of parents/kids and states*/
clear
input num_adults
0
1
2
3
end
gen always_1=1
save adults_data, replace
clear
input num_kids
0
1
2
3
4
5
end
gen always_1=1
save kids_data, replace
joinby always_1 using adults_data
joinby always_1 using state_key_pop

/*Predict family values of employer contributions, varying by number of adults, number of kids, and state*/
est restore reg_emp_value
predict emcontrb_1979
replace emcontrb_1979 = emcontrb_1979 * pop_share
collapse (sum) emcontrb_1979, by(num_adults num_kids statefip)
save emp_1979, replace

/*Get number of employer contribution enrollees by year*/
import excel "caseloads_pre_1980.xlsx", firstrow sheet("Employer_HI") clear
drop if year=="" | year=="Source" | year=="Note"
destring year spending, replace
keep year spending cpi_med frac_1979
replace year=year+1/*Put into survey year*/
gen enrollees = spending * emp_dollars_1979_frac / (avg_emp_spending_1979 * frac_1979) 
save privatePre1980, replace

/*Put enrollees into scalars to be called on later*/
forvalues i = 1961/1979 {
	sum enrollees if year==`i'
	scalar enrollees_`i' = r(mean)
}

/*Predict who gets employer contributions each year*/
use cps_pre_1980, clear
est restore reg_emp
predict prob_emp if num_workers>0 & relate==101 
replace prob_emp = 0 if prob_emp==.

gen num_emp = fam_weight * prob_emp
gen emp_receipt = 0
capture program drop impute_emp
program define impute_emp
	sum num_emp if year==`1'
	scalar factor_prob = `2'/r(sum)
	replace prob_emp = min(1,prob_emp * factor_prob) if year==`1'
	replace emp_receipt = rbinomial(1,prob_emp) if year==`1'
	sum fam_weight if emp_receipt==1 & year==`1'
	scalar diff_sums = r(sum) - `2'
	
	while diff_sums<-20000 {
		local j = max(1,round(abs(diff_sums/20000)))
		gsample `j' [w=prob_emp] if emp_receipt==0 & year==`1', wor gen(add_emp)
		replace emp_receipt = 1 if add_emp==1
		drop add_emp
		sum fam_weight if emp_receipt==1 & year==`1'
		scalar diff_sums = r(sum) - `2'
	}
	while diff_sums>20000 {
		local j = max(1,round(abs(diff_sums/20000)))
		gsample `j' if emp_receipt==1 & year==`1', wor gen(add_emp)
		replace emp_receipt = 0 if add_emp==1
		drop add_emp
		sum fam_weight if emp_receipt==1 & year==`1'
		scalar diff_sums = r(sum) - `2'
	}
end

forvalues i = 1964/1979  {
	impute_emp `i' enrollees_`i'
}

/*Merge in employer contribution values by state/number of kids and adults*/
merge m:1 num_adults num_kids statefip using emp_1979
drop if _merge==2
drop _merge
merge m:1 year using privatePre1980
drop if _merge==2
drop _merge
replace emcontrb = 0 if year<=1979
replace emcontrb = emcontrb_1979 * frac_1979 if emp_receipt==1

save cps_pre_1980_emp, replace




/***************************************
****************************************
****Impute other in-kind transfers****
****************************************
****************************************/
set seed 52673675

/*For food stamps, get percent of state population covered in each year using Almond et al. (2016) data*/
clear
gen year=.
set obs 18
replace year = 1960 + _n
gen always_1=1
save just_years, replace

use fsdatabyfips, clear /*Data available here: https://www.aeaweb.org/articles?id=10.1257/aer.20130375*/
rename fs_year year
bysort stfips: egen state_pop = sum(totalpop)
gen pct_pop = totalpop / state_pop
collapse (sum) pct_pop, by(stfips year)
save fs_pops, replace
collapse (sum) pct_pop, by(stfips)
keep stfips
gen always_1=1
joinby always_1 using just_years
drop always_1
merge 1:1 stfips year using fs_pops, nogen
replace pct_pop = 0 if pct_pop==.
sort stfips year
bysort stfips: gen foodstamp_covered = sum(pct_pop)
drop pct_pop
drop if year==.
rename stfips statefip2
append using just_years
replace statefip2=2 if statefip2==./*Add in Alaska since missing in Almond et al. (2016) data*/
replace foodstamp_covered = 1 if year>=1977/*Since data only go through 1976 and Almond et al. note all counties rolled out by 1978..."We limit the sample to 1978 and later, after the FSP has been rolled out in all counties"...p. 919)*/
replace year = year+1

joinby statefip2 using state_key_pop
replace foodstamp_covered = foodstamp_covered * pop_share
collapse (sum) foodstamp_covered, by(statefip year)
gen schllunch_covered = 1
gen houssub_covered = 1
save program_coverage, replace

/*Bring in admin data on caseloads and spending*/
import excel "caseloads_pre_1980.xlsx", firstrow sheet("in_kind") cellrange(a2:g23) clear
replace year = year + 1
drop if year==.
save in_kind_admin, replace

/*Put enrollees and spending into scalars to be called on later*/
foreach x in foodstamp schllunch houssub {
	forvalues i = 1964/1980 {
		sum `x'_enrollees if year==`i'
		scalar `x'_enrollees_`i' = r(mean)
		sum `x'_cost if year==`i'
		scalar `x'_cost_`i' = r(mean)
	}
}

/*Use 1980 CPS to get prediction models*/
use cps_pre_1980, clear
merge m:1 statefip year using program_coverage
drop if _merge==2
drop _merge
merge m:1 year using in_kind_admin, nogen

/*Get percent of program recipients covered by CPS in 1980, so can apply this ratio for other years*/
sum asecwt2 if foodstamp>0 & year==1980
scalar foodstamp_ratio_1980 = r(sum)/foodstamp_enrollees_1980

sum asecwt2 if schllunch>0 & year==1980
scalar schllunch_ratio_1980 = r(sum)/schllunch_enrollees_1980/*In reality, only kids are recipients, so number of people in families with school lunch is much higher than number of kids recipients. So this is adjusting for that fact as well as misreporting in CPS*/

sum asecwt if houssub>0 & tag_fam==1 & year==1980
scalar houssub_ratio_1980 = r(sum)/houssub_enrollees_1980/*Here the admin data is based on number of households. Using number of families here since not everyone in household necessarily covered by subsidy*/

gen fam_weight_foodstamp = fam_weight
gen fam_weight_schllunch = fam_weight
gen fam_weight_houssub = asecwt/*Since want to match number of households in admin data*/


/*Do regressions, predict probability of receipt and value*/
foreach x in foodstamp schllunch houssub {
	bysort year serial famunit: egen `x'_fam = max(`x')
	gen rec_`x' = `x'_fam>0 & `x'_fam!=.
/*Predict probability of receipt*/	
	reg rec_`x' i.age_recode i.num_adults i.num_kids i.rec_welfare i.educ_recode i.sex i.race i.married fam_income fam_income_2 fam_income_3 if year==1980 & relate==101 [pweight=fam_weight_`x']/*Using linear probabilty model because probit wasn't converging*/
	predict prob_`x' if year<=1979 & year>=1968 & relate==101
	reg rec_`x' i.age_recode i.num_adults i.num_kids i.educ_recode i.sex i.race i.married fam_income fam_income_2 fam_income_3 if year==1980 & relate==101 [pweight=fam_weight_`x']/*Using linear probabilty model because probit wasn't converging*/
	predict prob2_`x' if year<=1967 & relate==101
	replace prob_`x' = prob2_`x' if year<=1967
	replace prob_`x' = 0 if prob_`x'==.
	replace prob_`x' = 0 if prob_`x'<0
	replace prob_`x' = 1 if prob_`x'>1
/*Scale probabilities based on percent of state's population covered by program in each year*/
	replace prob_`x' = prob_`x' * `x'_covered
/*Predict value conditional on receipt*/
	reg `x'_fam i.num_adults##c.fam_income i.num_kids##c.fam_income i.num_adults#c.fam_income_2 i.num_kids#c.fam_income_2 i.num_adults#c.fam_income_3 i.num_kids#c.fam_income_3 if rec_`x'==1 & year==1980 & relate==101 [pweight=fam_weight_`x']
	predict value_`x' if year<=1979 & relate==101
	replace value_`x'= 0 if value_`x'==.
	/*Don't allow values of benefits to be below 5th percentile or above 95th percentile*/
	sum `x' if year==1980 & `x'>0, detail
	replace value_`x' = `r(p5)' if value_`x'<`r(p5)' & year<=1979 & relate==101
	replace value_`x' = `r(p95)' if value_`x'>`r(p95)' & year<=1979 & relate==101
	gen num_`x' = fam_weight_`x' * prob_`x'
	gen `x'_receipt = 0
}

capture program drop impute_inkind
program define impute_inkind
	sum num_`3' if year==`1'
	scalar factor_prob = `2'*`3'_ratio_1980/r(sum)
	replace prob_`3' = min(1,prob_`3' * factor_prob) if year==`1'
	replace `3'_receipt = rbinomial(1,prob_`3') if year==`1'
	sum fam_weight_`3' if `3'_receipt==1 & year==`1'
	scalar diff_sums = r(sum) - `2'*`3'_ratio_1980
	
	while diff_sums<-20000 {
		local j = max(1,round(abs(diff_sums/20000)))
		gsample `j' [w=prob_`3'] if `3'_receipt==0 & year==`1', wor gen(add_`3')
		replace `3'_receipt = 1 if add_`3'==1
		drop add_`3'
		sum fam_weight_`3' if `3'_receipt==1 & year==`1'
		scalar diff_sums = r(sum) - `2'*`3'_ratio_1980
	}
	while diff_sums>20000 {
		local j = max(1,round(abs(diff_sums/20000)))
		gsample `j' if `3'_receipt==1 & year==`1', wor gen(add_`3')
		replace `3'_receipt = 0 if add_`3'==1
		drop add_`3'
		sum fam_weight_`3' if `3'_receipt==1 & year==`1'
		scalar diff_sums = r(sum) - `2'*`3'_ratio_1980
	}
end

foreach x in foodstamp schllunch houssub {
forvalues i = 1964/1979 {
	impute_inkind `i' `x'_enrollees_`i' `x'
}
replace value_`x' = 0 if `x'_receipt!=1 | year==1980
replace value_`x' = `x' if year==1980 & tag_fam==1
gen value_`x'_wt = asecwt * value_`x'
bysort year: egen sum_value_`x' = sum(value_`x'_wt)
sum sum_value_`x' if year==1980
replace value_`x' = value_`x' * (`x'_cost / sum_value_`x') * (`r(mean)' / `x'_cost_1980)
}

save cps_pre_1980_inkind, replace

/*Merge all benefits datasets together*/
use cps_pre_1980_medicare, clear
keep year serial pernum pmvcare
merge 1:1 year serial pernum using cps_pre_1980_medicaid, nogen
keep year serial pernum pmvcare pmvcaid
merge 1:1 year serial pernum using cps_pre_1980_emp
keep year serial pernum pmvcare pmvcaid emcontrb
merge 1:1 year serial pernum using cps_pre_1980_inkind
foreach x in foodstamp schllunch houssub {
	drop `x'
	rename value_`x' `x'
}
keep year serial pernum pmvcare pmvcaid emcontrb foodstamp schllunch houssub
foreach x in pmvcare pmvcaid emcontrb foodstamp schllunch houssub {
	rename `x' `x'_pre
}
keep if year>=1964 & year<=1979
save cps_pre_1980_benefits, replace

use cps_all, clear
merge 1:1 year serial pernum using cps_pre_1980_benefits, nogen
foreach x in pmvcare pmvcaid emcontrb foodstamp schllunch houssub {
	replace `x' = `x'_pre if year<=1979
	drop `x'_pre
}
save cps_all2, replace


***********************************************************
***********************************************************
*******************Simulate taxes**************************
***********************************************************
***********************************************************


***********************************************************
******************Create Tax Units*************************
***********************************************************
use cps_all2, clear

bysort year serial famunit: egen familyID = min(recordNum)

* Tax unit unique within year/household, constructed with recordNum because this is unique to each person, and thus is necessarily unique for tax units
* Note that recordNum is unique within year, use this because other variables are not standard across years
gen taxUnit = .

* Place secondary (unrelated) individuals in their own family
replace taxUnit = recordNum if ftype==5

* Place primary households (non-family) in their own tax unit
replace taxUnit = recordNum if ftype==2

* Place primary family members in primary family tax unit
replace taxUnit = familyID if ftype==1

* Place related/unrelated subfamily members into their subfamily
replace taxUnit = familyID if ftype==3 | ftype==4

* Adult children placed in own subfamily if they are not a reference person or spouse of family head
replace taxUnit = recordNum if age>19 & famrel==3 & year>1987
replace taxUnit = recordNum if age>19 & relate==301 & ftype==1 & year<=1987

* Ever-married, younger children are placed in their own subfamily if they are not the reference person or spouse of a family
replace taxUnit = recordNum if age<=19 & (marst==1 | marst==2 | marst==3 | marst==4 | marst==5) & (famrel!=1 & famrel!=2) & year>1987
replace taxUnit = recordNum if age<=19 & (marst==1 | marst==2 | marst==3 | marst==4 | marst==5) & (relate!=101 & relate!=201 & relate!=202 & relate!=203) & ftype==1 & year<=1987

* Find age of oldest taxUnit member
bysort year serial taxUnit: egen tempMaxTUAge = max(age)
bysort year year: egen tempMaxHHAge = max(age)

* Determine if anyone in the taxUnit has ever been married
gen tempEverMarried = 0
replace tempEverMarried = 1 if marst<6
bysort year serial taxUnit: egen tempTUAnyMarried = max(tempEverMarried)

* Place individuals in tax units with heads who are unmarried and < 20 in the HH's primary family
* If there is no primary family, assign to HH's oldest individual
* Individuals < 15 and living alone are dropped, as they have no income. Those 15 and older kept as own taxUnit

gen tempTU = .
replace tempTU = taxUnit if (ftype==1 | ftype==2)
bysort year serial: egen tempPrimaryTU = min(tempTU)

sort year serial recordNum
replace taxUnit = tempPrimaryTU if tempTUAnyMarried == 0 & tempMaxTUAge < 20 & tempMaxHHAge > 19

bysort year serial taxUnit: egen tempMaxTUAge2 = max(age)
gen oldest = 1 if age == tempMaxHHAge
by year serial: egen temp = sum(oldest)
bysort year serial oldest: egen temp2 = min(recordNum)
replace oldest = 0 if temp > 1 & taxUnit != temp2

gen temp3 = 0
replace temp3 = taxUnit if oldest == 1
by year serial: egen tempOldestTU = max(temp3)
replace taxUnit = tempOldestTU if tempTUAnyMarried == 0 & tempMaxTUAge2 < 20 & tempMaxHHAge > 19

bysort year serial taxUnit: gen TUSize = _N
*gen child = age<18
bysort year serial taxUnit: egen children = total(child)
*Correct for exclusion of children 0-13 in before 1968
recode uh_child18_a1 (-9 = .), gen(uh_child18_a1_recode)
bysort year serial taxUnit: egen max_uh_child18_a1 = max(uh_child18_a1_recode)
replace TUSize = TUSize + max(max_uh_child18_a1 - children,0) if year<=1967

***********************************************************
******************Create TaxSim variables******************
***********************************************************

*Create SOI state codes for TaxSim from FIPS codes: https://users.nber.org/~taxsim/statesoi.html
decode statefip, gen(statefip_name)
replace statefip_name = "" if statefip>56
encode statefip_name, gen(state)

keep year serial taxUnit state age recordNum marst inc* TUSize max_uh_child18_a1

* marriage status
gen mstat = 1
replace mstat = 2 if marst==1

*Primary and spouse age
sort year serial taxUnit age
*calling page oldest and sage 2nd oldest in tax unit
by year serial taxUnit: gen page = age[_N]
by year serial taxUnit: gen sage = age[_N-1]
*no spouse age if single
replace sage = 0 if mstat==1
replace sage = 0 if sage==.
replace mstat = 1 if sage==.

*Determine dependents
gen u18 = 0
replace u18 = 1 if age <= 18
gen u17 = 0
replace u17 = 1 if age<=17
gen u13 = 0
replace u13 = 1 if age<=13

bysort year serial taxUnit: egen depx = total(u18)
gen u13_pre_1968 = max(max_uh_child18_a1 - depx,0)
replace depx = depx + u13_pre_1968 if year<=1967
* Do not allow young TU heads or couples to be own dependents
replace depx = max(depx - 1,0) if TUSize == depx & mstat == 1
replace depx = max(depx - 2,0) if TUSize == depx & mstat == 2
*Determine younger age statuses
bysort year serial taxUnit: egen dep13 = total(u13)
replace dep13 = u13_pre_1968 if year<=1967
replace dep13 = depx if dep13>depx
bysort year serial taxUnit: egen dep17 = total(u17)
replace dep17 = dep17 + u13_pre_1968 if year<=1967
replace dep17 = depx if dep17>depx
*By construction dependents under age 18 is dependents
gen dep18 = depx

*Set up wage income
*pwages and swages
gen labor = incwage + incbus + incfarm 

*Determine my primary filer / head in each year
bysort year serial taxUnit: egen tempPrimaryLabor = max(labor)
*Calling the person with the highest income the tax unit head
gen primaryFiler = 0
replace primaryFiler = 1 if labor == tempPrimaryLabor
bysort year serial taxUnit: egen tempNumFilers = total(primaryFiler)
bysort year serial taxUnit: egen tempPrimaryFilerNum = min(recordNum)
*Calling everyone else a non-primary
replace primaryFiler = 0 if tempNumFilers > 1 & recordNum != tempPrimaryFilerNum
drop tempNumFilers
*Making sure exactly 1 primary filer 
bysort year serial taxUnit: egen tempNumFilers = total(primaryFiler)
replace primaryFiler = 1 if tempNumFilers == 0 & recordNum == tempPrimaryFilerNum

* Assume head's wages are their own, all other wages go to spouse
* Negative wages set to zero since Taxsim does not accept negative wages
* [Note that this will work because deleting all non-heads later on]
rename labor pwages
replace pwages = 0 if pwages < 0
bysort year serial taxUnit: egen tempTotalTUWages = total(pwages)
gen swages = tempTotalTUWages - pwages
replace swages = 0 if swages < 0
*If single, setting own wages to be all wages in tax unit
replace pwages = pwages + swages if mstat==1
replace swages = 0 if mstat==1
	

*SET UP ALL OTHER INDIVIDUAL INCOME SOURCES. This is the order they appear in taxsim	
*dividends
gen p_div = 0
replace p_div = incdrt if year>=1976 & year<=1987 // Dividends were with rents in this period (and with interest below before that)
replace p_div = incdivid if year>=1988 
bysort year serial taxUnit: egen dividends = total(p_div)

*interc
gen p_int = 0
replace p_int = incidr if year>=1968 & year<=1975
replace p_int = incint if year>=1976
bysort year serial taxUnit: egen intrec = total(p_int)
*capital gains (no info)
gen stcg = 0
gen ltcg = 0
*otherprop
gen p_prop = 0
replace p_prop = incrent if year>=1988
bysort year serial taxUnit: egen otherprop = total(p_prop)
*nonprop
gen p_nonprop = 0
replace p_nonprop = incunern if year<=1967
replace p_nonprop = incaloth if year>=1968 & year<=1975
replace p_nonprop = incaloth if year>=1976 & year<=1987
replace p_nonprop = incalim + inceduc + incother if year>=1988
bysort year serial taxUnit: egen nonprop = total(p_nonprop)
*pensions
gen p_retire = 0
replace p_retire = incretir if year>=1976
bysort year serial taxUnit: egen pensions = total(p_retire)
* Social Security = incss in all years
bysort year serial taxUnit: egen gssi = total(incss)
* ui
gen p_ui = 0
replace p_ui = incunemp if year>=1988
bysort year serial taxUnit: egen ui = total(p_ui)
*transfers
gen p_transfers = 0
replace p_transfers = incwelfr + incgov    if year>=1968 & year<=1975
replace p_transfers = incwelfr + incgov + incssi if year>=1976 & year<=1987
replace p_transfers = incwelfr + incssi + incwkcom + incvet + incsurv + incdisab + incasist + incchild if year>=1988
bysort year serial taxUnit: egen transfers = total(p_transfers)
*Other items with no info
gen rentpaid = 0
gen proptax = 0
gen otheritem =0
gen childcare = 0
gen mortgage = 0

*Confirm all income accounted for
gen test = pwages + incss + p_div + p_int + p_prop + p_nonprop + p_retire  + p_ui + p_transfers

*Using pwages as the verificaiton rather than its components since drop negatives
gen testinc = .
replace testinc = pwages + incunern if year<=1967
replace testinc = pwages + incss + incwelfr + incgov + incidr + incaloth if year>=1968 & year<=1975
replace testinc = pwages + incss + incwelfr + incgov + incretir + incssi + incdrt + incint + incaloth if year>=1976 & year<=1987
replace testinc = pwages + incss + incwelfr + incretir + incssi + incint + incunemp + incwkcom + incvet + incsurv + incdisab + incdivid + incrent + inceduc + incchild + incalim + incasist + incother if year>=1988 
*Should be no observations from this line.
tab year if test!=testinc

*REPLACING THE YEAR WITH THE INCOME (TAX) YEAR
replace year = year - 1
	
keep year serial taxUnit primaryFiler recordNum year state mstat page sage depx dep13 dep17 dep18 pwages swages dividends intrec stcg ltcg otherprop nonprop pensions gssi ui transfers rentpaid proptax otheritem childcare mortgage

keep if year>=1963
keep if primaryFiler==1
replace dividends = 0 if dividends<0
replace depx = 15 if depx>15
replace dep13 = 15 if dep13>15
replace dep17 = 15 if dep17>15
replace dep18 = 15 if dep18>15

keep recordNum year state mstat page sage depx dep13 dep17 dep18 pwages swages dividends intrec stcg ltcg otherprop nonprop pensions gssi ui transfers rentpaid proptax otheritem childcare mortgage
order recordNum year state mstat page sage depx dep13 dep17 dep18 pwages swages dividends intrec stcg ltcg otherprop nonprop pensions gssi ui transfers rentpaid proptax otheritem childcare mortgage

save taxsim_input, replace

replace state = 0 if year<=1977 /*State taxes not available on TAXSIM before 1978*/
taxsimlocal35,replace
replace fica = fica / 2 /*Since TAXSIM includes employer and employee portion*/
keep recordNum fiitax siitax fica
save taxsim_output, replace

*Simulate state taxes based on 1978 state tax laws
use taxsim_input, clear
drop if year>=1978
replace state = 0  if year>=1967 & year<=1976 /*Specific state IDs not available in CPS-ASEC between 1968 and 1976*/
replace state = 0  if state==.
replace year = 1978
taxsimlocal35,replace
keep recordNum siitax
rename siitax siitax_1978
save taxsim_output_state_1978, replace

***********************************************************
**************Merge with full dataset**********************
***********************************************************

use cps_all2, clear
rename fiitax fiitax_old
rename siitax siitax_old
rename fica fica_old

merge 1:1 recordNum using taxsim_output
recode fiitax (. = 0)
recode siitax (. = 0)
recode fica (. = 0)

drop _merge
merge 1:1 recordNum using taxsim_output_state_1978
recode siitax_1978 (. = 0)

*Want full family value for only one person in family
egen one_per_fam = tag(year serial famunit)
foreach x in foodstamp schllunch houssub {
	replace `x' = 0 if one_per_fam==0
}

replace asecwt = 0 if asecwt<0

save cps_all2, replace

***********************************************************
**************Adjust weights**********************
***********************************************************
use cps_all2, clear

gen povfam = famunit
*Some cases where there is an unrelated subfamily members in the primary family.  Pulling them out (while keeping with any other unrelated subfamily members)
replace povfam = povfam + 0.5 if ftype==4
*Famunit is the same for some secondary individuals and nonfamily householders.  Setting based on person ID to make them all independent individuals
replace povfam = 100+pernum if ftype==2 | ftype==5
	
*Construct size-adjusted family income
*Note: Using the official poverty definitions for family size (long discussion of this on the IPUMS site and the different family options)
rename famsize IPUMS_famsize  // Note that the IPUMS family size differs slightly.
	
sort year serial povfam
by year serial povfam: gen fam_observe = _N
	
*Since 1968 family size equals the number of people we observe
gen famsize = fam_observe if year>=1968
	
*Fix a few missing fammemb
sort year serial povfam
gen fammemb = uh_fnumper_a1 if year<=1967
by year serial povfam: egen temp = max(fammemb)
replace fammemb = temp if year<=1967 & fammemb==.
drop temp
	
*Fix topcoding of family size before 1968
sort year serial povfam
by year serial povfam: gen temp = _n==1
by year serial: egen numfam = total(temp)
drop temp
gen fammemb_recode = fammemb
replace fammemb_recode=uh_numper_a1 if year<=1967 & numfam==1 & fammemb==7 & uh_numper_a1>7
replace famsize = fammemb_recode if year<=1967
	
*FOR WEIGHTING, NEED TO REFLECT THAT WE AREN'T OBSERVING CHILDREN PRIOR TO 1968 SO NEED TO WEIGHT UP TO INCLUDE THEM
gen personWeight = asecwt
replace personWeight = 0 if personWeight<0
replace personWeight = personWeight * famsize/fam_observe if year<=1967
replace personWeight = 0 if personWeight < 0

bysort year serial: gen hhsize = _N

* There are some cases where perinhh==0 for certain individuals.  That doesn't make sense, and perinhh should be the same for all in HH.  So updating to be consistent
* Also keeping hhsize computation for remaining cases where perinhh==0
gen perinhh = uh_numper_a1
bysort year serial: egen temp1 = max(perinhh)
replace perinhh = temp1 if perinhh==0
drop temp1
replace hhsize = perinhh if year<=1967 & perinhh>0

save cps_all2, replace
